#Required libraries

# Tidyverse for data science and exploration
require(dplyr)
require(tidyr)
require(readr)
require(tibble)
require(stringr)
require(purrr)
require(forcats)
require(rlang)

# enhances tidyverse
require(tidylog) # additional logging
require(magrittr) # additional data pipe syntax


# for reading data in multiple formats
require(readxl)
require(haven)

# visual analysis
require(ggplot2)
require(GGally) # extensions to ggplot
require(gt) # well formatted tables
# client-side interactive publishable graphics
require(plotly)
require(leaflet)
require(crosstalk)
require(htmlwidgets)
# server-side interactive graphics
require(shiny)
require(shinyjs)
# Canned Interactive EDA 
require(ExPanDaR)

Exploring KU Book Processing Charges

# read KU data frame
KUbpc.df <- read_csv("Public Data/openapc-de/data/bpc.csv")
Parsed with column specification:
cols(
  institution = col_character(),
  period = col_double(),
  euro = col_double(),
  doi = col_character(),
  backlist_oa = col_logical(),
  publisher = col_character(),
  book_title = col_character(),
  isbn = col_character(),
  isbn_print = col_character(),
  isbn_electronic = col_character(),
  license_ref = col_character(),
  indexed_in_crossref = col_logical(),
  doab = col_logical()
)
# read DOAB metadata

source('Public Data/DOAB/doabingest.R')
DOABmeta.df <- doabFetch()
embedded nul(s) found in input


head(KUbpc.df)
head(summary(KUbpc.df))
 institution            period          euro          doi           
 Length:938         Min.   :2017   Min.   :1075   Length:938        
 Class :character   1st Qu.:2017   1st Qu.:1875   Class :character  
 Mode  :character   Median :2018   Median :1981   Mode  :character  
                    Mean   :2018   Mean   :4368                     
                    3rd Qu.:2019   3rd Qu.:8250                     
                    Max.   :2020   Max.   :8978                     
 backlist_oa      publisher          book_title            isbn          
 Mode :logical   Length:938         Length:938         Length:938        
 FALSE:357       Class :character   Class :character   Class :character  
 TRUE :581       Mode  :character   Mode  :character   Mode  :character  
                                                                         
                                                                         
                                                                         
  isbn_print        isbn_electronic    license_ref       
 Length:938         Length:938         Length:938        
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
 indexed_in_crossref    doab        
 Mode :logical       Mode :logical  
 FALSE:127           FALSE:44       
 TRUE :811           TRUE :894      
                                    
                                    
                                    
ggplot(data = KUbpc.df, aes(KUbpc.df$institution)) + geom_bar() 


ggplot(data = KUbpc.df, aes(KUbpc.df$euro)) + geom_histogram()

General Exploratory Data Analysis


ggplot(data = KUbpc.df) + geom_bar(mapping = aes(x = KUbpc.df$doab))


# Date to Doab
date_doab <- KUbpc.df %>% ggplot(data = KUbpc.df, mapping = aes(x = KUbpc.df$period, colour = KUbpc.df$doab)) + geom_freqpoly(binwidth = 0.1)
ggplotly(date_doab)


# publisher_euro <- KUbpc.df %>% 
# ggplot(data = KUbpc.df, mapping = aes(x = KUbpc.df$publisher, colour = KUbpc.df$euro)) + geom_freqpoly(binwidth = 0.1)

# Institution to Euro
institution_euro <- KUbpc.df %>% ggplot(data = KUbpc.df, mapping = aes(x = KUbpc.df$euro)) + geom_freqpoly(mapping = aes(colour = KUbpc.df$institution), binwidth = 500)

ggplotly(institution_euro)

NA

Idea: Publishers vs. Charges

Question: How do the top 25% of publishers divide up charges (in Euro)?

Observation: Charges are grouped around ~2000 Euros and ~8000 Euros.


publisher_counts <- KUbpc.df %>%
    group_by(publisher) %>%
    tally
tally: now 110 rows and 2 columns, ungrouped
sorted_counts = arrange(publisher_counts, desc(n))

total_n = sum(sorted_counts$n)
quarter_n = 0.25 * total_n
new_n = sum(sorted_counts$n[0:6])

sorted_counts %>% filter(n > 24)

# filtered <- filter(KUbpc.df$publisher %in% sorted_counts$publisher)

filtered <- filter(KUbpc.df, KUbpc.df$publisher == 'transcript Verlag' |
                     KUbpc.df$publisher == 'Duke University Press' |
                     KUbpc.df$publisher == 'University of Michigan Press' |
                     KUbpc.df$publisher == 'Manchester University Press' |
                     KUbpc.df$publisher == 'Pluto Press' |
                     KUbpc.df$publisher == 'Liverpool University Press')

head(filtered)

euro_publisher <- filtered %>% 
  ggplot(data = filtered, mapping = aes(x = filtered$publisher, y = filtered$euro), 
         aes(x = filtered$publisher, y = filtered$euro)) + 
  # geom_count(aes(color = ..n.., size = after_stat(prop), group = euro)) + 
  geom_count(aes(color = ..n.., group = euro)) + 
  scale_size_area(max_size = 10) + 
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(title = "How Publishers Divide Charges", x = "Top 25% of Publishers", y = "Price (Euro)", color = 'Number of Copies') +
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17))

# ggplot:
ggplotly(euro_publisher)


# crosstalk:
ft <- highlight_key(filtered)
gg_ft <- ggplot(data = ft, mapping = aes(x = filtered$publisher, y = filtered$euro)) + 
  geom_count(aes(color = ..n.., size = after_stat(prop), group = euro)) + 
  labs(title = "How Publishers Divide Charges", x = "Top 25% of Publishers", y = "Price (Euro)", color = 'Number of Copies') +
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17))
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)
All elements of `...` must be named.
Did you want `key = c(key)`?Sum of bscol width units is greater than 12
bscols(cross_ft)


# shared_euro_publisher <- SharedData$new(filtered)
# leaflet(shared_euro_publisher) %>% addMarkers()
# data.table::data.table(shared_euro_publisher)

Idea: Publishers’ Charges vs. Year/OA Type

Sub-Question: What best explains the particular division of charges? (Year, OA Type)

Observation: The low and high charge groups seem to be defined by the type of OA business model, whereas the slight differences within each group seem to be defined by the year.


head(filtered)

# Does Type of OA impact the particular division of charges?

euro_oa_publisher <- filtered %>% 
  ggplot(data = filtered, mapping = aes(x = filtered$backlist_oa, y = filtered$euro), 
         aes(x = filtered$backlist_oa, y = filtered$euro)) + 
  geom_count(aes(color = ..n.., group = euro)) + 
  scale_size_area(max_size = 10) + 
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(title = "How OA Impacts Price Division of Charges", x = "Type of OA", y = "Price (Euro)", color = 'Number of Copies')

# ggplot:
ggplotly(euro_oa_publisher)


# crosstalk:
ft <- highlight_key(filtered)
gg_ft <- ggplot(data = ft, mapping = aes(x = filtered$backlist_oa, y = filtered$euro)) + 
  geom_count(aes(color = ..n.., size = after_stat(prop), group = euro)) + 
  labs(title = "How OA Impacts Division of Charges", x = "Type of OA", y = "Price (Euro)", color = 'Number of Copies')
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)
All elements of `...` must be named.
Did you want `key = c(key)`?Sum of bscol width units is greater than 12
bscols(cross_ft)



# Does Year impact the particular division of charges?

euro_year_publisher <- filtered %>% 
  ggplot(data = filtered, mapping = aes(x = filtered$period, y = filtered$euro), 
         aes(x = filtered$period, y = filtered$euro)) + 
  geom_count(aes(color = ..n.., group = euro)) + 
  scale_size_area(max_size = 10) + 
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(title = "How Year Impacts Price Division of Charges", x = "Year", y = "Price (Euro)", color = 'Number of Copies')

# ggplot:
ggplotly(euro_year_publisher)


# crosstalk:
ft <- highlight_key(filtered)
gg_ft <- ggplot(data = ft, mapping = aes(x = filtered$period, y = filtered$euro)) + 
  geom_count(aes(color = ..n.., size = after_stat(prop), group = euro)) + 
  labs(title = "How Year Impacts Division of Charges", x = "Year", y = "Price (Euro)", color = 'Number of Copies')
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)
All elements of `...` must be named.
Did you want `key = c(key)`?Sum of bscol width units is greater than 12
bscols(cross_ft)

NA
NA

Idea: Publishers vs. OA

Question: What type of business model do the top 25% publishers use?

Observation: Most have a higher proportion of True (moved to OA from traditional publishing) than False (already published OA).


oa_type <- filtered %>% 
  ggplot(data = filtered, mapping = aes(x = filtered$publisher, colour = filtered$backlist_oa), fill = filtered$backlist_oa) +
  geom_bar(position = "fill", width = 0.7, fill="#EAEAEA") +
  labs(title = "Business Model OA for Publishers", x = "Top 25% of Publishers", y = "Proportion", color = 'Types of OA') +
  theme(axis.text = element_text(size = rel(0.75))) +
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17)) +
  scale_color_brewer(palette = "Set1")

ggplotly(oa_type)


# crosstalk:
ft <- highlight_key(filtered)
oa_ft <- ggplot(data = ft, mapping = aes(x = ft$publisher, colour = ft$backlist_oa), fill = ft$backlist_oa) +
  geom_bar(position = "fill", width = 0.7) +
  labs(title = "Business Model OA for Publishers", x = "Top 25% of Publishers", y = "Proportion of Backlist OA", color = 'Types of OA') +
  theme(axis.text = element_text(size = rel(0.75))) +
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17))
# cross_oa_ft <- bscols(
#   filter_select("publisher", "Select a publisher", ft, ~publisher),
#   ggplotly(oa_ft, dynamicTicks = TRUE),
#   # widths = c(12, 12)
# )

# bscols(cross_oa_ft)

Idea: Publishers’ OA vs. Year

Question: Did OA business models of the top 25% publishers change per year?

Observation:


# Plot, for a single publisher, the proportion of backlist_oa values within
# each period as filled bars, and return the interactive plotly version.
#
# pub_name: publisher name, matched exactly against filtered$publisher.
#
# Reads the global data frame `filtered`.
# NOTE(review): `fill = pub_ft$backlist_oa` is passed to ggplot() outside
# aes(), so it appears to have no effect on the plot -- confirm.
# NOTE(review): scale_x_discrete() is given numeric limits, which is what
# triggers the "Continuous limits supplied to discrete scale" warning printed
# after each call below.
oa_time <- function(pub_name) {
  pub_ft <- filter(filtered, filtered$publisher == pub_name)
  
  pub_oa <- pub_ft %>% 
    ggplot(data = pub_ft, mapping = aes(x = pub_ft$period, colour = pub_ft$backlist_oa), fill = pub_ft$backlist_oa) +
    geom_bar(position = "fill", width = 0.7, fill="#EAEAEA") +
    labs(title = paste(pub_name, "'s OA Through the Years", sep = ""), 
         x = "Years", y = "Proportion of Backlist OA", color = 'Types of OA') +
    theme(axis.text = element_text(size = rel(0.75))) +
    scale_x_discrete(limits=c(2017, 2018, 2019)) +
    scale_color_brewer(palette = "Set1")

  ggplotly(pub_oa)
  
}

top25_list = c("transcript Verlag", "Duke University Press", "University of Michigan Press", "Manchester University Press", "Pluto Press", "Liverpool University Press")

oa_time("transcript Verlag")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

oa_time("Duke University Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

oa_time("University of Michigan Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

oa_time("Manchester University Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

oa_time("Pluto Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

oa_time("Liverpool University Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

Idea: Revenue vs. OA

Question: What total revenue are publishers receiving each year?

Observation:


# Finding total revenue for each publisher

# Total charges (euro) recorded for one publisher.
#
# pub_name: publisher name, matched exactly against the `publisher` column of
#           the global data frame `filtered`.
# Returns the sum of `euro` over the matching rows (0 when nothing matches).
revenue_finder <- function(pub_name) {
  pub_filtered <- filter(filtered, .data$publisher == pub_name)
  # Return the sum visibly; the previous `rev = sum(...)` returned it invisibly
  # and shadowed base::rev().
  sum(pub_filtered$euro)
}

revenue_df <- data.frame("publisher" = top25_list)
revenue_list <- c()

for (i in top25_list) {
  revenue_list<-c(revenue_list,revenue_finder(i))
}

revenue_df$revenue <- c(revenue_list)
print(revenue_df)

# ggplot:
publisher_revenue <- revenue_df %>%
  ggplot(data = revenue_df, mapping = aes(x = revenue_df$publisher, y = revenue_df$revenue), fill = revenue_df$revenue) +
  geom_col() +
  labs(title = "Total Revenue for Publishers", x = "Top 25% of Publishers", y = "Revenue (Euro)", color = 'Types of OA') +
  theme(axis.text = element_text(size = rel(0.75))) +
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17)) +
  scale_fill_brewer(palette = "Set1")

ggplotly(publisher_revenue)
Use of `revenue_df$publisher` is discouraged. Use `publisher` instead.Use of `revenue_df$revenue` is discouraged. Use `revenue` instead.

Idea: Revenue vs. OA

Question: What revenue are publishers receiving per year?

Observation:


# Finding total revenue for each publisher

revlist_2017 <- c()
revlist_2018 <- c()
revlist_2019 <- c()

revlist <- c()

for (name in top25_list) {
  pub_name <- filter(filtered, filtered$publisher == name)
  rev_2017 = sum(pub_name[pub_name$period == 2017,]$euro)
  revlist_2017 <- c(revlist_2017, rev_2017)
  rev_2018 = sum(pub_name[pub_name$period == 2018,]$euro)
  revlist_2018 <- c(revlist_2018, rev_2018)
  rev_2019 = sum(pub_name[pub_name$period == 2019,]$euro)
  revlist_2019 <- c(revlist_2019, rev_2019)
}

revenue_df <- data.frame("publisher" = top25_list)
revenue_df$'2017' <- c(revlist_2017)
revenue_df$'2018' <- c(revlist_2018)
revenue_df$'2019' <- c(revlist_2019)

print(revenue_df)

revenue_year <- c(revenue_df$'2017', revenue_df$'2018', revenue_df$'2019')
year <- c('2017', '2018', '2019')

# ggplot:
pub_year_revenue1 <- revenue_df %>%
  
  ggplot(data = revenue_df, mapping = aes(x = '2017', y = revenue_df$'2017', fill = revenue_df$publisher)) +
  geom_bar(position="dodge", stat="identity") +
  labs(title = "Total Revenue for Publishers", x = "Year", y = "Revenue (Euro)", color = 'Publishers') +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_revenue1)
Use of `revenue_df$"2017"` is discouraged. Use `2017` instead.Use of `revenue_df$publisher` is discouraged. Use `publisher` instead.

pub_year_revenue2 <- revenue_df %>%
  
  ggplot(data = revenue_df, mapping = aes(x = '2018', y = revenue_df$'2018', fill = revenue_df$publisher)) +
  geom_bar(position="dodge", stat="identity") +
  labs(title = "Total Revenue for Publishers", x = "Year", y = "Revenue (Euro)", color = 'Publishers') +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_revenue2)
Use of `revenue_df$"2018"` is discouraged. Use `2018` instead.Use of `revenue_df$publisher` is discouraged. Use `publisher` instead.

pub_year_revenue3 <- revenue_df %>%
  
  ggplot(data = revenue_df, mapping = aes(x = '2019', y = revenue_df$'2019', fill = revenue_df$publisher)) +
  geom_bar(position="dodge", stat="identity") +
  labs(title = "Total Revenue for Publishers", x = "Year", y = "Revenue (Euro)", color = 'Publishers') +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_revenue3)
Use of `revenue_df$"2019"` is discouraged. Use `2019` instead.Use of `revenue_df$publisher` is discouraged. Use `publisher` instead.

Continued, tried putting it into one graph.


revlist <- c()
revlist_2017 <- c()
revlist_2018 <- c()
revlist_2019 <- c()

for (name in top25_list) {
  pub_name <- filter(filtered, filtered$publisher == name)
  rev_2017 = sum(pub_name[pub_name$period == 2017,]$euro)
  revlist_2017 <- c(revlist_2017, rev_2017)
  rev_2018 = sum(pub_name[pub_name$period == 2018,]$euro)
  revlist_2018 <- c(revlist_2018, rev_2018)
  rev_2019 = sum(pub_name[pub_name$period == 2019,]$euro)
  revlist_2019 <- c(revlist_2019, rev_2019)
}

revlist <- c(revlist_2017, revlist_2018, revlist_2019)

print(revlist)
 [1] 127420 153491  43044  96849  51824  48131  58125  61875  54750  36000
[11]  53625  21375  51750  58125  36000  17625  40500  42375
nrev <- matrix(revlist, ncol=6, byrow=TRUE)
colnames(nrev) <- top25_list
rownames(nrev) <- c("2017", "2018", "2019")
nrev <- as.table(nrev)
nrev <- as.data.frame.matrix(nrev)

print(nrev)

#, nrev$`Duke University Press`, nrev$`University of Michigan Press`, nrev$`Pluto Press`, nrev$`Manchester University Press`, nrev$`Liverpool University Press`

pub_year_rev <- nrev %>%
  
  ggplot(data = nrev, mapping = aes(x = c("2017", "2018", "2019"), y = c(nrev$"transcript Verlag"), fill = nrev$publisher)) +
  geom_bar(position="dodge", stat="identity") +
  labs(title = "Total Revenue for Publishers", x = "Year", y = "Revenue (Euro)", color = 'Publishers') +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_rev)
Use of `nrev$"transcript Verlag"` is discouraged. Use `transcript Verlag` instead.Use of `nrev$publisher` is discouraged. Use `publisher` instead.

Idea: DOAB analysis

Question: What is the average time gap between year of publication and added on date?

Observation:


DOABmeta.df <- filter(DOABmeta.df, is.na(DOABmeta.df$Year.of.publication))
print(DOABmeta.df$Year.of.publication[1:4])
[1] NA NA NA NA
gap = mean(DOABmeta.df$Added.on.date - DOABmeta.df$Year.of.publication[1:3])
Error in DOABmeta.df$Added.on.date - DOABmeta.df$Year.of.publication[1:3] : 
  non-numeric argument to binary operator

Comparison of charges by year and backlist

# create faceted plot object
charges.plot <- KUbpc.df %>% ggplot(aes(euro))+geom_histogram(bins=6)+facet_grid(rows=vars(period), cols = vars(backlist_oa))


## Present as Standard plot
 plot(charges.plot)


# this plot will render publicly https://htmlpreview.github.io/?https://github.com/MIT-Informatics/monograph/blob/master/00%20EDA%20Start.nb.html

Interactive charges exploration

 ggplotly(charges.plot)

# https://mit-informatics.github.io/monograph/demo.html
### Interactive Dataset Exploration 
KUbpc.df %>% ExPanD(df=.       ,title="KU Book Processing Charges",export_nb_option = TRUE)
# ExPanD uses shiny() which works running R locally, but isn't going to work through github. Could publish through shinyapps.io (low usage only), or export  a non-interactive notebook it
# see: https://drmaltman.shinyapps.io/demo/
---
title: "Exploratory Analysis"
output: html_notebook
---

```{r}
#Required libraries

# Tidyverse for data science and exploration
require(dplyr)
require(tidyr)
require(readr)
require(tibble)
require(stringr)
require(purrr)
require(forcats)
require(rlang)

# enhances tidyverse
require(tidylog) # additional logging
require(magrittr) # additional data pipe syntax


# for reading data in multiple formats
require(readxl)
require(haven)

# visual analysis
require(ggplot2)
require(GGally) # extensions to ggplot
require(gt) # well formatted tables
# client-side interactive publishable graphics
require(plotly)
require(leaflet)
require(crosstalk)
require(htmlwidgets)
# server-side interactive graphics
require(shiny)
require(shinyjs)
# Canned Interactive EDA 
require(ExPanDaR)


```
## Exploring KU Book Processing Charges
```{r  }
# read KU data frame
# The path is relative, so the notebook must be run from the directory that
# contains the "Public Data" folder.
KUbpc.df <- read_csv("Public Data/openapc-de/data/bpc.csv")
# read DOAB metadata

# doabFetch() is not defined in this notebook; presumably it is provided by
# the sourced script below -- TODO confirm.
source('Public Data/DOAB/doabingest.R')
DOABmeta.df <- doabFetch()
```
```{r  }


# First rows and per-column summary of the KU charge records.
head(KUbpc.df)
head(summary(KUbpc.df))

# Number of records per institution. Columns are referenced by bare name
# inside aes(); `KUbpc.df$col` is discouraged by ggplot2 and leaks the data
# frame name into the axis label.
ggplot(KUbpc.df, aes(x = institution)) +
  geom_bar()

# Distribution of charges in euro. bins = 30 is the ggplot2 default, stated
# explicitly so the "pick better value with `binwidth`" message is not shown.
ggplot(KUbpc.df, aes(x = euro)) +
  geom_histogram(bins = 30)

```
## General Exploratory Data Analysis
```{r  }

# Number of records by DOAB flag.
ggplot(KUbpc.df, aes(x = doab)) +
  geom_bar()

# Date to Doab: record counts per period, one line per DOAB flag.
# The data frame is passed once (not both piped and via `data =`), and
# columns are referenced by bare name inside aes().
date_doab <- ggplot(KUbpc.df, aes(x = period, colour = doab)) +
  geom_freqpoly(binwidth = 0.1)
ggplotly(date_doab)

# publisher_euro <- KUbpc.df %>% 
# ggplot(data = KUbpc.df, mapping = aes(x = KUbpc.df$publisher, colour = KUbpc.df$euro)) + geom_freqpoly(binwidth = 0.1)

# Institution to Euro: distribution of charges (500-euro bins), one line per
# institution.
institution_euro <- ggplot(KUbpc.df, aes(x = euro)) +
  geom_freqpoly(mapping = aes(colour = institution), binwidth = 500)

ggplotly(institution_euro)

```
## Idea: Publishers vs. Charges
## Question: How do the top 25% of publishers divide up charges (in Euro)?
## Observation: Charges are grouped around ~2000 Euros and ~8000 Euros. 
```{r  }

# Number of records per publisher, then the same table sorted largest first.
publisher_counts <- count(KUbpc.df, publisher)
sorted_counts <- arrange(publisher_counts, desc(n))

# Total number of records, a quarter of that total, and the number of records
# held by the six largest publishers. head() replaces `n[0:6]`, which only
# worked because R silently drops a zero index.
total_n <- sum(sorted_counts$n)
quarter_n <- 0.25 * total_n
new_n <- sum(head(sorted_counts$n, 6))

# Publishers with more than 24 records.
sorted_counts %>% filter(n > 24)

# filtered <- filter(KUbpc.df$publisher %in% sorted_counts$publisher)

# Restrict the data to the six publishers analysed in the rest of the
# notebook. %in% replaces the chain of `==` / `|` comparisons.
top_publishers <- c(
  "transcript Verlag",
  "Duke University Press",
  "University of Michigan Press",
  "Manchester University Press",
  "Pluto Press",
  "Liverpool University Press"
)
filtered <- filter(KUbpc.df, publisher %in% top_publishers)

head(filtered)

# Charges per publisher: one point per (publisher, euro) combination, with
# point size and colour showing how many records share that charge.
#
# Changes from the earlier version:
# - the data and mapping are passed once (the duplicate unnamed aes() and the
#   piped copy of the data were swallowed by `...`);
# - columns are referenced by bare name inside aes();
# - after_stat(n) replaces the deprecated `..n..` notation;
# - axis labels are wrapped with str_wrap() directly; the former
#   str_replace_all(x, "foo", " ") step was a placeholder that matched nothing.
euro_publisher <- ggplot(filtered, aes(x = publisher, y = euro)) +
  geom_count(aes(color = after_stat(n), group = euro)) +
  scale_size_area(max_size = 10) +
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(
    title = "How Publishers Divide Charges",
    x = "Top 25% of Publishers",
    y = "Price (Euro)",
    color = "Number of Copies"
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17))

# ggplot:
ggplotly(euro_publisher)

# crosstalk:
# Shared-data wrapper so the publisher selector and the plot act on the same
# rows.
ft <- highlight_key(filtered)

# The aesthetics use bare column names so they are resolved against `ft`.
# Referencing `filtered$publisher` / `filtered$euro` took the values from the
# plain data frame and bypassed the shared-data object.
gg_ft <- ggplot(ft, aes(x = publisher, y = euro)) +
  geom_count(aes(color = after_stat(n), size = after_stat(prop), group = euro)) +
  labs(
    title = "How Publishers Divide Charges",
    x = "Top 25% of Publishers",
    y = "Price (Euro)",
    color = "Number of Copies"
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17))

# NOTE(review): widths = c(12, 12) makes crosstalk warn that the widths sum to
# more than 12; it is kept on the assumption that the intent is to place the
# selector on its own row above the plot -- confirm.
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)

# cross_ft is already a bscols() layout; print it rather than wrapping it in
# bscols() a second time.
cross_ft

# shared_euro_publisher <- SharedData$new(filtered)
# leaflet(shared_euro_publisher) %>% addMarkers()
# data.table::data.table(shared_euro_publisher)


```
## Idea: Publishers' Charges vs. Year/OA Type
## Sub-Question: What best explains the particular division of charges? (Year, OA Type)
## Observation: The low and high charge groups seem to be defined by the type of OA business model, whereas the slight differences within each group seem to be defined by the year. 
```{r  }

head(filtered)

# In every plot of this chunk the data and mapping are passed once, columns
# are referenced by bare name inside aes(), and after_stat(n) replaces the
# deprecated `..n..` notation.

# Does Type of OA impact the particular division of charges?

euro_oa_publisher <- ggplot(filtered, aes(x = backlist_oa, y = euro)) +
  geom_count(aes(color = after_stat(n), group = euro)) +
  scale_size_area(max_size = 10) +
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(
    title = "How OA Impacts Price Division of Charges",
    x = "Type of OA",
    y = "Price (Euro)",
    color = "Number of Copies"
  )

# ggplot:
ggplotly(euro_oa_publisher)

# crosstalk: aesthetics are resolved against the shared-data object `ft` so
# the publisher selector filters the plotted rows.
ft <- highlight_key(filtered)
gg_ft <- ggplot(ft, aes(x = backlist_oa, y = euro)) +
  geom_count(aes(color = after_stat(n), size = after_stat(prop), group = euro)) +
  labs(
    title = "How OA Impacts Division of Charges",
    x = "Type of OA",
    y = "Price (Euro)",
    color = "Number of Copies"
  )
# NOTE(review): widths = c(12, 12) triggers crosstalk's "greater than 12"
# warning; kept on the assumption that stacking the two widgets is intended.
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)

# Already a bscols() layout; print it directly.
cross_ft


# Does Year impact the particular division of charges?

euro_year_publisher <- ggplot(filtered, aes(x = period, y = euro)) +
  geom_count(aes(color = after_stat(n), group = euro)) +
  scale_size_area(max_size = 10) +
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(
    title = "How Year Impacts Price Division of Charges",
    x = "Year",
    y = "Price (Euro)",
    color = "Number of Copies"
  )

# ggplot:
ggplotly(euro_year_publisher)

# crosstalk:
ft <- highlight_key(filtered)
gg_ft <- ggplot(ft, aes(x = period, y = euro)) +
  geom_count(aes(color = after_stat(n), size = after_stat(prop), group = euro)) +
  labs(
    title = "How Year Impacts Division of Charges",
    x = "Year",
    y = "Price (Euro)",
    color = "Number of Copies"
  )
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)

cross_ft


```
## Idea: Publishers vs. OA
## Question: What type of business model do the top 25% publishers use?
## Observation: Most have a higher proportion of True (moved to OA from traditional publishing) than False (already published OA).
```{r  }

# Share of backlist_oa TRUE/FALSE records per publisher, drawn as filled
# (proportion) bars with a grey fill and the outline coloured by backlist_oa.
# The `fill = filtered$backlist_oa` argument that used to be passed to
# ggplot() sat outside aes() and was ignored, so it has been removed.
oa_type <- ggplot(filtered, aes(x = publisher, colour = backlist_oa)) +
  geom_bar(position = "fill", width = 0.7, fill = "#EAEAEA") +
  labs(
    title = "Business Model OA for Publishers",
    x = "Top 25% of Publishers",
    y = "Proportion",
    color = "Types of OA"
  ) +
  theme(axis.text = element_text(size = rel(0.75))) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17)) +
  scale_color_brewer(palette = "Set1")

ggplotly(oa_type)

# crosstalk:
ft <- highlight_key(filtered)
# `ft` is a shared-data object, not a data frame, so `ft$publisher` and
# `ft$backlist_oa` do not return columns. Bare column names let ggplot resolve
# them from the wrapped data instead.
oa_ft <- ggplot(ft, aes(x = publisher, colour = backlist_oa)) +
  geom_bar(position = "fill", width = 0.7) +
  labs(
    title = "Business Model OA for Publishers",
    x = "Top 25% of Publishers",
    y = "Proportion of Backlist OA",
    color = "Types of OA"
  ) +
  theme(axis.text = element_text(size = rel(0.75))) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17))
# cross_oa_ft <- bscols(
#   filter_select("publisher", "Select a publisher", ft, ~publisher),
#   ggplotly(oa_ft, dynamicTicks = TRUE),
#   # widths = c(12, 12)
# )

# bscols(cross_oa_ft)


```
## Idea: Publishers' OA vs. Year
## Question: Did OA business models of the top 25% publishers change per year?
## Observation:
```{r  }

# Plot, for a single publisher, the proportion of backlist_oa values within
# each year as filled bars, and return the interactive plotly version.
#
# pub_name: publisher name, matched exactly against the `publisher` column of
#           the global data frame `filtered`.
# Returns the plotly object produced by ggplotly().
oa_time <- function(pub_name) {
  pub_ft <- filter(filtered, .data$publisher == pub_name)

  # `period` is numeric, so it is converted to a factor to match the discrete
  # x scale. The limits are given as the matching character labels, which
  # keeps all three years on the axis even when a publisher has no records in
  # one of them. This removes the "Continuous limits supplied to discrete
  # scale" warning. The former `fill = ...` argument to ggplot() was outside
  # aes() and ignored.
  pub_oa <- ggplot(pub_ft, aes(x = factor(period), colour = backlist_oa)) +
    geom_bar(position = "fill", width = 0.7, fill = "#EAEAEA") +
    labs(
      title = paste0(pub_name, "'s OA Through the Years"),
      x = "Years",
      y = "Proportion of Backlist OA",
      color = "Types of OA"
    ) +
    theme(axis.text = element_text(size = rel(0.75))) +
    scale_x_discrete(limits = c("2017", "2018", "2019")) +
    scale_color_brewer(palette = "Set1")

  ggplotly(pub_oa)
}

# Names of the six publishers analysed throughout the notebook.
top25_list <- c(
  "transcript Verlag",
  "Duke University Press",
  "University of Michigan Press",
  "Manchester University Press",
  "Pluto Press",
  "Liverpool University Press"
)

# One interactive OA-by-year plot per publisher. Each call is a separate
# top-level expression so that each widget is printed by the notebook.
oa_time("transcript Verlag")

oa_time("Duke University Press")

oa_time("University of Michigan Press")

oa_time("Manchester University Press")

oa_time("Pluto Press")

oa_time("Liverpool University Press")

```
## Idea: Revenue vs. OA
## Question: What total revenue has each of the top publishers received across all years?
## Observation: 
```{r  }

# Finding total revenue for each publisher

# Total charges (euro) recorded for one publisher.
#
# pub_name: publisher name, matched exactly against the `publisher` column of
#           the global data frame `filtered`.
# Returns the sum of `euro` over the matching rows (0 when nothing matches).
revenue_finder <- function(pub_name) {
  pub_filtered <- filter(filtered, .data$publisher == pub_name)
  # Return the sum visibly; the previous `rev = sum(...)` returned it invisibly
  # and shadowed base::rev().
  sum(pub_filtered$euro)
}

# One row per publisher in top25_list with its total revenue.
revenue_df <- data.frame("publisher" = top25_list)

# vapply() calls revenue_finder() once per publisher and checks that each
# result is a single number; it replaces growing a vector with c() in a loop.
revenue_list <- vapply(top25_list, revenue_finder, numeric(1), USE.NAMES = FALSE)

revenue_df$revenue <- revenue_list
print(revenue_df)

# ggplot:
# Bar chart of total revenue per publisher.
# `fill` is now mapped inside aes(); before, it was passed to ggplot() outside
# aes() and ignored, so scale_fill_brewer() had nothing to colour. It is
# mapped to `publisher` because the Brewer palette is a discrete scale. The
# legend title is set on `fill` (the old `color = 'Types of OA'` label
# referred to an aesthetic this plot does not use).
publisher_revenue <- ggplot(
  revenue_df,
  aes(x = publisher, y = revenue, fill = publisher)
) +
  geom_col() +
  labs(
    title = "Total Revenue for Publishers",
    x = "Top 25% of Publishers",
    y = "Revenue (Euro)",
    fill = "Publisher"
  ) +
  theme(axis.text = element_text(size = rel(0.75))) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17)) +
  scale_fill_brewer(palette = "Set1")

ggplotly(publisher_revenue)


```
## Idea: Revenue vs. OA
## Question: What revenue are publishers receiving per year?
## Observation: 
```{r  }

# Finding total revenue for each publisher

# Revenue (sum of `euro` in the global data frame `filtered`) for one
# publisher in one year.
revenue_for <- function(pub, yr) {
  sum(filtered$euro[filtered$publisher == pub & filtered$period == yr])
}

# Per-year revenue vectors, in top25_list order. vapply() replaces growing
# each vector with c() inside a loop.
revlist_2017 <- vapply(top25_list, revenue_for, numeric(1), yr = 2017, USE.NAMES = FALSE)
revlist_2018 <- vapply(top25_list, revenue_for, numeric(1), yr = 2018, USE.NAMES = FALSE)
revlist_2019 <- vapply(top25_list, revenue_for, numeric(1), yr = 2019, USE.NAMES = FALSE)

# Wide table: one row per publisher, one revenue column per year.
revenue_df <- data.frame("publisher" = top25_list)
revenue_df$`2017` <- revlist_2017
revenue_df$`2018` <- revlist_2018
revenue_df$`2019` <- revlist_2019

print(revenue_df)

revenue_year <- c(revenue_df$`2017`, revenue_df$`2018`, revenue_df$`2019`)
year <- c("2017", "2018", "2019")

# Bar chart of every publisher's revenue for one year.
# yr: name of a year column of revenue_df, e.g. "2017".
# The legend title is set on `fill`, the aesthetic that is actually mapped
# (the earlier code labelled `color`, which had no effect).
plot_year_revenue <- function(yr) {
  ggplot(revenue_df, aes(x = yr, y = .data[[yr]], fill = publisher)) +
    geom_col(position = "dodge") +
    labs(
      title = "Total Revenue for Publishers",
      x = "Year",
      y = "Revenue (Euro)",
      fill = "Publishers"
    ) +
    theme(axis.text = element_text(size = rel(0.75)))
}

# ggplot:
pub_year_revenue1 <- plot_year_revenue("2017")
ggplotly(pub_year_revenue1)

pub_year_revenue2 <- plot_year_revenue("2018")
ggplotly(pub_year_revenue2)

pub_year_revenue3 <- plot_year_revenue("2019")
ggplotly(pub_year_revenue3)

# All years in one faceted figure. pivot_longer() replaces the superseded
# gather(), and `fill` is mapped inside aes() so that it takes effect.
revenue_df %>%
  pivot_longer(cols = all_of(year), names_to = "year", values_to = "Euro") %>%
  ggplot(aes(x = publisher, y = Euro, fill = publisher)) +
  geom_col() +
  facet_wrap(vars(year))

```
### Continued, tried putting it into one graph. 
```{r}

# Revenue of every publisher in top25_list for one year, in list order.
# Reads the global data frame `filtered`.
revenue_for_year <- function(yr) {
  vapply(
    top25_list,
    function(pub) {
      sum(filtered$euro[filtered$publisher == pub & filtered$period == yr])
    },
    numeric(1),
    USE.NAMES = FALSE
  )
}

revlist_2017 <- revenue_for_year(2017)
revlist_2018 <- revenue_for_year(2018)
revlist_2019 <- revenue_for_year(2019)

revlist <- c(revlist_2017, revlist_2018, revlist_2019)

print(revlist)

# Year-by-publisher table: one row per year, one column per publisher.
nrev <- matrix(revlist, ncol = length(top25_list), byrow = TRUE)
colnames(nrev) <- top25_list
rownames(nrev) <- c("2017", "2018", "2019")
nrev <- as.table(nrev)
nrev <- as.data.frame.matrix(nrev)

print(nrev)

# Long form (one row per year/publisher pair) so that all publishers can be
# drawn in one chart. unlist() walks the table column by column, hence
# `each` for the publisher labels and `times` for the year labels.
nrev_long <- data.frame(
  year = rep(rownames(nrev), times = ncol(nrev)),
  publisher = rep(colnames(nrev), each = nrow(nrev)),
  revenue = unlist(nrev, use.names = FALSE)
)

# Grouped bars: years on the x axis, one bar per publisher. The previous plot
# mapped fill to `nrev$publisher`, a column that does not exist, and drew only
# the "transcript Verlag" column.
pub_year_rev <- ggplot(
  nrev_long,
  aes(x = year, y = revenue, fill = publisher)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Total Revenue for Publishers",
    x = "Year",
    y = "Revenue (Euro)",
    fill = "Publishers"
  ) +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_rev)

```
## Idea: DOAB analysis
## Question: What is the average time gap between year of publication and added on date? 
## Observation: 
```{r}

# Keep only the records that HAVE a publication year. The earlier condition
# lacked the `!` and kept only the rows where the year was missing.
DOABmeta.df <- filter(DOABmeta.df, !is.na(Year.of.publication))
print(head(DOABmeta.df$Year.of.publication, 4))

# The earlier subtraction failed with "non-numeric argument to binary
# operator", so both columns are converted to numeric years first.
# NOTE(review): assumes Year.of.publication holds a plain year and that
# Added.on.date contains a four-digit year somewhere in its text (the first
# such run of digits is used) -- confirm against the output of doabFetch().
publication_year <- as.numeric(as.character(DOABmeta.df$Year.of.publication))
added_year <- as.numeric(
  str_extract(as.character(DOABmeta.df$Added.on.date), "\\d{4}")
)

# Mean gap in years over all records; the earlier `[1:3]` truncated one side
# of the subtraction to three values. Rows whose year cannot be parsed are
# skipped.
gap <- mean(added_year - publication_year, na.rm = TRUE)
print(gap)

```
### Comparison of charges by year and backlist
```{r}
# Histogram of charges in euro (6 bins), split into a grid of panels with one
# row per period and one column per backlist_oa value.
charges.plot <- ggplot(data = KUbpc.df, mapping = aes(x = euro)) +
  geom_histogram(bins = 6) +
  facet_grid(rows = vars(period), cols = vars(backlist_oa))

# Draw the static version of the plot.
plot(charges.plot)

# this plot will render publicly https://htmlpreview.github.io/?https://github.com/MIT-Informatics/monograph/blob/master/00%20EDA%20Start.nb.html

```
### Interactive charges exploration
```{r}
 # Convert the faceted charges histogram (charges.plot) into an interactive
 # plotly widget.
 ggplotly(charges.plot)
# https://mit-informatics.github.io/monograph/demo.html

```
### Interactive Dataset Exploration
```{r}
# Open the ExPanD interactive explorer on the KU charges data.
ExPanD(
  df = KUbpc.df,
  title = "KU Book Processing Charges",
  export_nb_option = TRUE
)
# ExPanD uses shiny, which works when R runs locally but will not work through
# GitHub. Options: publish through shinyapps.io (low usage only), or export a
# non-interactive notebook from it.
# see: https://drmaltman.shinyapps.io/demo/
```

